import pandas as pd
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "vscode"
pio.renderers.default = "notebook"
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import when
from pyspark.sql.functions import to_date, count
from pyspark.sql.functions import col, explode, split, count
from pyspark.sql.functions import col, count, expr
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number
from pyspark.sql.functions import col, count
import plotly.graph_objects as go
import osModule 04: Lab 01
Visual Reporting and Storytelling
Objectives
By the end of this lab, you will: 1. Load and analyze the Lightcast dataset in Spark DataFrame. 2. Create five easy and three medium-complexity visualizations using Plotly. 3. Explore salary distributions, employment trends, and job postings. 4. Analyze skills in relation to NAICS/SOC/ONET codes and salaries. 5. Customize colors, fonts, and styles in all visualizations (default themes result in a 2.5-point deduction). 6. Follow best practices for reporting on data communication.
Step 1: Load the Dataset
# Initialize Spark Session
spark = SparkSession.builder.appName("LightcastData").getOrCreate()
# Load Data
df = spark.read.option("header", "true") \
.option("inferSchema", "true") \
.option("multiLine", "true") \
.option("escape", "\"") \
.option("encoding", "utf-8") \
.csv("./data/lightcast_job_postings.csv")
# Show Schema and Sample Data
df.printSchema()
df.show(5)root
|-- ID: string (nullable = true)
|-- LAST_UPDATED_DATE: string (nullable = true)
|-- LAST_UPDATED_TIMESTAMP: timestamp (nullable = true)
|-- DUPLICATES: integer (nullable = true)
|-- POSTED: string (nullable = true)
|-- EXPIRED: string (nullable = true)
|-- DURATION: integer (nullable = true)
|-- SOURCE_TYPES: string (nullable = true)
|-- SOURCES: string (nullable = true)
|-- URL: string (nullable = true)
|-- ACTIVE_URLS: string (nullable = true)
|-- ACTIVE_SOURCES_INFO: string (nullable = true)
|-- TITLE_RAW: string (nullable = true)
|-- BODY: string (nullable = true)
|-- MODELED_EXPIRED: string (nullable = true)
|-- MODELED_DURATION: integer (nullable = true)
|-- COMPANY: integer (nullable = true)
|-- COMPANY_NAME: string (nullable = true)
|-- COMPANY_RAW: string (nullable = true)
|-- COMPANY_IS_STAFFING: boolean (nullable = true)
|-- EDUCATION_LEVELS: string (nullable = true)
|-- EDUCATION_LEVELS_NAME: string (nullable = true)
|-- MIN_EDULEVELS: integer (nullable = true)
|-- MIN_EDULEVELS_NAME: string (nullable = true)
|-- MAX_EDULEVELS: integer (nullable = true)
|-- MAX_EDULEVELS_NAME: string (nullable = true)
|-- EMPLOYMENT_TYPE: integer (nullable = true)
|-- EMPLOYMENT_TYPE_NAME: string (nullable = true)
|-- MIN_YEARS_EXPERIENCE: integer (nullable = true)
|-- MAX_YEARS_EXPERIENCE: integer (nullable = true)
|-- IS_INTERNSHIP: boolean (nullable = true)
|-- SALARY: integer (nullable = true)
|-- REMOTE_TYPE: integer (nullable = true)
|-- REMOTE_TYPE_NAME: string (nullable = true)
|-- ORIGINAL_PAY_PERIOD: string (nullable = true)
|-- SALARY_TO: integer (nullable = true)
|-- SALARY_FROM: integer (nullable = true)
|-- LOCATION: string (nullable = true)
|-- CITY: string (nullable = true)
|-- CITY_NAME: string (nullable = true)
|-- COUNTY: integer (nullable = true)
|-- COUNTY_NAME: string (nullable = true)
|-- MSA: integer (nullable = true)
|-- MSA_NAME: string (nullable = true)
|-- STATE: integer (nullable = true)
|-- STATE_NAME: string (nullable = true)
|-- COUNTY_OUTGOING: integer (nullable = true)
|-- COUNTY_NAME_OUTGOING: string (nullable = true)
|-- COUNTY_INCOMING: integer (nullable = true)
|-- COUNTY_NAME_INCOMING: string (nullable = true)
|-- MSA_OUTGOING: integer (nullable = true)
|-- MSA_NAME_OUTGOING: string (nullable = true)
|-- MSA_INCOMING: integer (nullable = true)
|-- MSA_NAME_INCOMING: string (nullable = true)
|-- NAICS2: integer (nullable = true)
|-- NAICS2_NAME: string (nullable = true)
|-- NAICS3: integer (nullable = true)
|-- NAICS3_NAME: string (nullable = true)
|-- NAICS4: integer (nullable = true)
|-- NAICS4_NAME: string (nullable = true)
|-- NAICS5: integer (nullable = true)
|-- NAICS5_NAME: string (nullable = true)
|-- NAICS6: integer (nullable = true)
|-- NAICS6_NAME: string (nullable = true)
|-- TITLE: string (nullable = true)
|-- TITLE_NAME: string (nullable = true)
|-- TITLE_CLEAN: string (nullable = true)
|-- SKILLS: string (nullable = true)
|-- SKILLS_NAME: string (nullable = true)
|-- SPECIALIZED_SKILLS: string (nullable = true)
|-- SPECIALIZED_SKILLS_NAME: string (nullable = true)
|-- CERTIFICATIONS: string (nullable = true)
|-- CERTIFICATIONS_NAME: string (nullable = true)
|-- COMMON_SKILLS: string (nullable = true)
|-- COMMON_SKILLS_NAME: string (nullable = true)
|-- SOFTWARE_SKILLS: string (nullable = true)
|-- SOFTWARE_SKILLS_NAME: string (nullable = true)
|-- ONET: string (nullable = true)
|-- ONET_NAME: string (nullable = true)
|-- ONET_2019: string (nullable = true)
|-- ONET_2019_NAME: string (nullable = true)
|-- CIP6: string (nullable = true)
|-- CIP6_NAME: string (nullable = true)
|-- CIP4: string (nullable = true)
|-- CIP4_NAME: string (nullable = true)
|-- CIP2: string (nullable = true)
|-- CIP2_NAME: string (nullable = true)
|-- SOC_2021_2: string (nullable = true)
|-- SOC_2021_2_NAME: string (nullable = true)
|-- SOC_2021_3: string (nullable = true)
|-- SOC_2021_3_NAME: string (nullable = true)
|-- SOC_2021_4: string (nullable = true)
|-- SOC_2021_4_NAME: string (nullable = true)
|-- SOC_2021_5: string (nullable = true)
|-- SOC_2021_5_NAME: string (nullable = true)
|-- LOT_CAREER_AREA: integer (nullable = true)
|-- LOT_CAREER_AREA_NAME: string (nullable = true)
|-- LOT_OCCUPATION: integer (nullable = true)
|-- LOT_OCCUPATION_NAME: string (nullable = true)
|-- LOT_SPECIALIZED_OCCUPATION: integer (nullable = true)
|-- LOT_SPECIALIZED_OCCUPATION_NAME: string (nullable = true)
|-- LOT_OCCUPATION_GROUP: integer (nullable = true)
|-- LOT_OCCUPATION_GROUP_NAME: string (nullable = true)
|-- LOT_V6_SPECIALIZED_OCCUPATION: integer (nullable = true)
|-- LOT_V6_SPECIALIZED_OCCUPATION_NAME: string (nullable = true)
|-- LOT_V6_OCCUPATION: integer (nullable = true)
|-- LOT_V6_OCCUPATION_NAME: string (nullable = true)
|-- LOT_V6_OCCUPATION_GROUP: integer (nullable = true)
|-- LOT_V6_OCCUPATION_GROUP_NAME: string (nullable = true)
|-- LOT_V6_CAREER_AREA: integer (nullable = true)
|-- LOT_V6_CAREER_AREA_NAME: string (nullable = true)
|-- SOC_2: string (nullable = true)
|-- SOC_2_NAME: string (nullable = true)
|-- SOC_3: string (nullable = true)
|-- SOC_3_NAME: string (nullable = true)
|-- SOC_4: string (nullable = true)
|-- SOC_4_NAME: string (nullable = true)
|-- SOC_5: string (nullable = true)
|-- SOC_5_NAME: string (nullable = true)
|-- LIGHTCAST_SECTORS: string (nullable = true)
|-- LIGHTCAST_SECTORS_NAME: string (nullable = true)
|-- NAICS_2022_2: integer (nullable = true)
|-- NAICS_2022_2_NAME: string (nullable = true)
|-- NAICS_2022_3: integer (nullable = true)
|-- NAICS_2022_3_NAME: string (nullable = true)
|-- NAICS_2022_4: integer (nullable = true)
|-- NAICS_2022_4_NAME: string (nullable = true)
|-- NAICS_2022_5: integer (nullable = true)
|-- NAICS_2022_5_NAME: string (nullable = true)
|-- NAICS_2022_6: integer (nullable = true)
|-- NAICS_2022_6_NAME: string (nullable = true)
+--------------------+-----------------+----------------------+----------+--------+---------+--------+--------------------+--------------------+--------------------+-----------+-------------------+--------------------+--------------------+---------------+----------------+--------+--------------------+-----------+-------------------+----------------+---------------------+-------------+-------------------+-------------+------------------+---------------+--------------------+--------------------+--------------------+-------------+------+-----------+----------------+-------------------+---------+-----------+--------------------+--------------------+-------------+------+--------------+-----+--------------------+-----+----------+---------------+--------------------+---------------+--------------------+------------+--------------------+------------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-----------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+----------+---------------+----------+---------------+---------------+--------------------+--------------+--------------------+--------------------------+-------------------------------+--------------------+-------------------------+-----------------------------+----------------------------------+-----------------+----------------------+-----------------------+----------------------------+------------------+-----------------------+-------+--------------------+-------+--------------------+-------+---------------+-------+---------------+-----------------+----------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+
| ID|LAST_UPDATED_DATE|LAST_UPDATED_TIMESTAMP|DUPLICATES| POSTED| EXPIRED|DURATION| SOURCE_TYPES| SOURCES| URL|ACTIVE_URLS|ACTIVE_SOURCES_INFO| TITLE_RAW| BODY|MODELED_EXPIRED|MODELED_DURATION| COMPANY| COMPANY_NAME|COMPANY_RAW|COMPANY_IS_STAFFING|EDUCATION_LEVELS|EDUCATION_LEVELS_NAME|MIN_EDULEVELS| MIN_EDULEVELS_NAME|MAX_EDULEVELS|MAX_EDULEVELS_NAME|EMPLOYMENT_TYPE|EMPLOYMENT_TYPE_NAME|MIN_YEARS_EXPERIENCE|MAX_YEARS_EXPERIENCE|IS_INTERNSHIP|SALARY|REMOTE_TYPE|REMOTE_TYPE_NAME|ORIGINAL_PAY_PERIOD|SALARY_TO|SALARY_FROM| LOCATION| CITY| CITY_NAME|COUNTY| COUNTY_NAME| MSA| MSA_NAME|STATE|STATE_NAME|COUNTY_OUTGOING|COUNTY_NAME_OUTGOING|COUNTY_INCOMING|COUNTY_NAME_INCOMING|MSA_OUTGOING| MSA_NAME_OUTGOING|MSA_INCOMING| MSA_NAME_INCOMING|NAICS2| NAICS2_NAME|NAICS3| NAICS3_NAME|NAICS4| NAICS4_NAME|NAICS5| NAICS5_NAME|NAICS6| NAICS6_NAME| TITLE| TITLE_NAME| TITLE_CLEAN| SKILLS| SKILLS_NAME| SPECIALIZED_SKILLS|SPECIALIZED_SKILLS_NAME| CERTIFICATIONS| CERTIFICATIONS_NAME| COMMON_SKILLS| COMMON_SKILLS_NAME| SOFTWARE_SKILLS|SOFTWARE_SKILLS_NAME| ONET| ONET_NAME| ONET_2019| ONET_2019_NAME| CIP6| CIP6_NAME| CIP4| CIP4_NAME| CIP2| CIP2_NAME|SOC_2021_2| SOC_2021_2_NAME|SOC_2021_3| SOC_2021_3_NAME|SOC_2021_4|SOC_2021_4_NAME|SOC_2021_5|SOC_2021_5_NAME|LOT_CAREER_AREA|LOT_CAREER_AREA_NAME|LOT_OCCUPATION| LOT_OCCUPATION_NAME|LOT_SPECIALIZED_OCCUPATION|LOT_SPECIALIZED_OCCUPATION_NAME|LOT_OCCUPATION_GROUP|LOT_OCCUPATION_GROUP_NAME|LOT_V6_SPECIALIZED_OCCUPATION|LOT_V6_SPECIALIZED_OCCUPATION_NAME|LOT_V6_OCCUPATION|LOT_V6_OCCUPATION_NAME|LOT_V6_OCCUPATION_GROUP|LOT_V6_OCCUPATION_GROUP_NAME|LOT_V6_CAREER_AREA|LOT_V6_CAREER_AREA_NAME| SOC_2| SOC_2_NAME| SOC_3| SOC_3_NAME| SOC_4| SOC_4_NAME| SOC_5| SOC_5_NAME|LIGHTCAST_SECTORS|LIGHTCAST_SECTORS_NAME|NAICS_2022_2| NAICS_2022_2_NAME|NAICS_2022_3| NAICS_2022_3_NAME|NAICS_2022_4| NAICS_2022_4_NAME|NAICS_2022_5| NAICS_2022_5_NAME|NAICS_2022_6| NAICS_2022_6_NAME|
+--------------------+-----------------+----------------------+----------+--------+---------+--------+--------------------+--------------------+--------------------+-----------+-------------------+--------------------+--------------------+---------------+----------------+--------+--------------------+-----------+-------------------+----------------+---------------------+-------------+-------------------+-------------+------------------+---------------+--------------------+--------------------+--------------------+-------------+------+-----------+----------------+-------------------+---------+-----------+--------------------+--------------------+-------------+------+--------------+-----+--------------------+-----+----------+---------------+--------------------+---------------+--------------------+------------+--------------------+------------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-----------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+----------+---------------+----------+---------------+---------------+--------------------+--------------+--------------------+--------------------------+-------------------------------+--------------------+-------------------------+-----------------------------+----------------------------------+-----------------+----------------------+-----------------------+----------------------------+------------------+-----------------------+-------+--------------------+-------+--------------------+-------+---------------+-------+---------------+-----------------+----------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+
|1f57d95acf4dc67ed...| 9/6/2024| 2024-09-06 16:32:...| 0|6/2/2024| 6/8/2024| 6| [\n "Company"\n]|[\n "brassring.c...|[\n "https://sjo...| []| NULL|Enterprise Analys...|31-May-2024\n\nEn...| 6/8/2024| 6| 894731| Murphy USA| Murphy USA| false| [\n 2\n]| [\n "Bachelor's ...| 2| Bachelor's degree| NULL| NULL| 1|Full-time (> 32 h...| 2| 2| false| NULL| 0| [None]| NULL| NULL| NULL|{\n "lat": 33.20...|RWwgRG9yYWRvLCBBUg==|El Dorado, AR| 5139| Union, AR|20980| El Dorado, AR| 5| Arkansas| 5139| Union, AR| 5139| Union, AR| 20980| El Dorado, AR| 20980| El Dorado, AR| 44| Retail Trade| 441|Motor Vehicle and...| 4413|Automotive Parts,...| 44133|Automotive Parts ...|441330|Automotive Parts ...|ET29C073C03D1F86B4|Enterprise Analysts|enterprise analys...|[\n "KS126DB6T06...|[\n "Merchandisi...|[\n "KS126DB6T06...| [\n "Merchandisi...| []| []|[\n "KS126706DPF...|[\n "Mathematics...|[\n "KS440W865GC...|[\n "SQL (Progra...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...|[\n "45.0601",\n...|[\n "Economics, ...|[\n "45.06",\n ...|[\n "Economics",...|[\n "45",\n "27...|[\n "Social Scie...| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231010|Business Intellig...| 23101011| General ERP Analy...| 2310| Business Intellig...| 23101011| General ERP Analy...| 231010| Business Intellig...| 2310| Business Intellig...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| [\n 7\n]| [\n "Artificial ...| 44| Retail Trade| 441|Motor Vehicle and...| 4413|Automotive Parts,...| 44133|Automotive Parts ...| 441330|Automotive Parts ...|
|0cb072af26757b6c4...| 8/2/2024| 2024-08-02 13:08:...| 0|6/2/2024| 8/1/2024| NULL| [\n "Job Board"\n]| [\n "maine.gov"\n]|[\n "https://job...| []| NULL|Oracle Consultant...|Oracle Consultant...| 8/1/2024| NULL| 133098|Smx Corporation L...| SMX| true| [\n 99\n]| [\n "No Educatio...| 99|No Education Listed| NULL| NULL| 1|Full-time (> 32 h...| 3| 3| false| NULL| 1| Remote| NULL| NULL| NULL|{\n "lat": 44.31...| QXVndXN0YSwgTUU=| Augusta, ME| 23011| Kennebec, ME|12300|Augusta-Watervill...| 23| Maine| 23011| Kennebec, ME| 23011| Kennebec, ME| 12300|Augusta-Watervill...| 12300|Augusta-Watervill...| 56|Administrative an...| 561|Administrative an...| 5613| Employment Services| 56132|Temporary Help Se...|561320|Temporary Help Se...|ET21DDA63780A7DC09| Oracle Consultants|oracle consultant...|[\n "KS122626T55...|[\n "Procurement...|[\n "KS122626T55...| [\n "Procurement...| []| []| []| []|[\n "BGSBF3F508F...|[\n "Oracle Busi...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...| []| []| []| []| []| []| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231010|Business Intellig...| 23101012| Oracle Consultant...| 2310| Business Intellig...| 23101012| Oracle Consultant...| 231010| Business Intellig...| 2310| Business Intellig...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| NULL| NULL| 56|Administrative an...| 561|Administrative an...| 5613| Employment Services| 56132|Temporary Help Se...| 561320|Temporary Help Se...|
|85318b12b3331fa49...| 9/6/2024| 2024-09-06 16:32:...| 1|6/2/2024| 7/7/2024| 35| [\n "Job Board"\n]|[\n "dejobs.org"\n]|[\n "https://dej...| []| NULL| Data Analyst|Taking care of pe...| 6/10/2024| 8|39063746| Sedgwick| Sedgwick| false| [\n 2\n]| [\n "Bachelor's ...| 2| Bachelor's degree| NULL| NULL| 1|Full-time (> 32 h...| 5| NULL| false| NULL| 0| [None]| NULL| NULL| NULL|{\n "lat": 32.77...| RGFsbGFzLCBUWA==| Dallas, TX| 48113| Dallas, TX|19100|Dallas-Fort Worth...| 48| Texas| 48113| Dallas, TX| 48113| Dallas, TX| 19100|Dallas-Fort Worth...| 19100|Dallas-Fort Worth...| 52|Finance and Insur...| 524|Insurance Carrier...| 5242|Agencies, Brokera...| 52429|Other Insurance R...|524291| Claims Adjusting|ET3037E0C947A02404| Data Analysts| data analyst|[\n "KS1218W78FG...|[\n "Management"...|[\n "ESF3939CE1F...| [\n "Exception R...|[\n "KS683TN76T7...|[\n "Security Cl...|[\n "KS1218W78FG...|[\n "Management"...|[\n "KS126HY6YLT...|[\n "Microsoft O...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...| []| []| []| []| []| []| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231113|Data / Data Minin...| 23111310| Data Analyst| 2311| Data Analysis and...| 23111310| Data Analyst| 231113| Data / Data Minin...| 2311| Data Analysis and...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| NULL| NULL| 52|Finance and Insur...| 524|Insurance Carrier...| 5242|Agencies, Brokera...| 52429|Other Insurance R...| 524291| Claims Adjusting|
|1b5c3941e54a1889e...| 9/6/2024| 2024-09-06 16:32:...| 1|6/2/2024|7/20/2024| 48| [\n "Job Board"\n]|[\n "disabledper...|[\n "https://www...| []| NULL|Sr. Lead Data Mgm...|About this role:\...| 6/12/2024| 10|37615159| Wells Fargo|Wells Fargo| false| [\n 99\n]| [\n "No Educatio...| 99|No Education Listed| NULL| NULL| 1|Full-time (> 32 h...| 3| NULL| false| NULL| 0| [None]| NULL| NULL| NULL|{\n "lat": 33.44...| UGhvZW5peCwgQVo=| Phoenix, AZ| 4013| Maricopa, AZ|38060|Phoenix-Mesa-Chan...| 4| Arizona| 4013| Maricopa, AZ| 4013| Maricopa, AZ| 38060|Phoenix-Mesa-Chan...| 38060|Phoenix-Mesa-Chan...| 52|Finance and Insur...| 522|Credit Intermedia...| 5221|Depository Credit...| 52211| Commercial Banking|522110| Commercial Banking|ET2114E0404BA30075|Management Analysts|sr lead data mgmt...|[\n "KS123QX62QY...|[\n "Exit Strate...|[\n "KS123QX62QY...| [\n "Exit Strate...| []| []|[\n "KS7G6NP6R6L...|[\n "Reliability...|[\n "KS4409D76NW...|[\n "SAS (Softwa...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...| []| []| []| []| []| []| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231113|Data / Data Minin...| 23111310| Data Analyst| 2311| Data Analysis and...| 23111310| Data Analyst| 231113| Data / Data Minin...| 2311| Data Analysis and...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| [\n 6\n]| [\n "Data Privac...| 52|Finance and Insur...| 522|Credit Intermedia...| 5221|Depository Credit...| 52211| Commercial Banking| 522110| Commercial Banking|
|cb5ca25f02bdf25c1...| 6/19/2024| 2024-06-19 03:00:00| 0|6/2/2024|6/17/2024| 15|[\n "FreeJobBoar...|[\n "craigslist....|[\n "https://mod...| []| NULL|Comisiones de $10...|Comisiones de $10...| 6/17/2024| 15| 0| Unclassified| LH/GM| false| [\n 99\n]| [\n "No Educatio...| 99|No Education Listed| NULL| NULL| 3|Part-time / full-...| NULL| NULL| false| 92500| 0| [None]| year| 150000| 35000|{\n "lat": 37.63...| TW9kZXN0bywgQ0E=| Modesto, CA| 6099|Stanislaus, CA|33700| Modesto, CA| 6|California| 6099| Stanislaus, CA| 6099| Stanislaus, CA| 33700| Modesto, CA| 33700| Modesto, CA| 99|Unclassified Indu...| 999|Unclassified Indu...| 9999|Unclassified Indu...| 99999|Unclassified Indu...|999999|Unclassified Indu...|ET0000000000000000| Unclassified|comisiones de por...| []| []| []| []| []| []| []| []| []| []|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...| []| []| []| []| []| []| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231010|Business Intellig...| 23101012| Oracle Consultant...| 2310| Business Intellig...| 23101012| Oracle Consultant...| 231010| Business Intellig...| 2310| Business Intellig...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| NULL| NULL| 99|Unclassified Indu...| 999|Unclassified Indu...| 9999|Unclassified Indu...| 99999|Unclassified Indu...| 999999|Unclassified Indu...|
+--------------------+-----------------+----------------------+----------+--------+---------+--------+--------------------+--------------------+--------------------+-----------+-------------------+--------------------+--------------------+---------------+----------------+--------+--------------------+-----------+-------------------+----------------+---------------------+-------------+-------------------+-------------+------------------+---------------+--------------------+--------------------+--------------------+-------------+------+-----------+----------------+-------------------+---------+-----------+--------------------+--------------------+-------------+------+--------------+-----+--------------------+-----+----------+---------------+--------------------+---------------+--------------------+------------+--------------------+------------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-----------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+----------+---------------+----------+---------------+---------------+--------------------+--------------+--------------------+--------------------------+-------------------------------+--------------------+-------------------------+-----------------------------+----------------------------------+-----------------+----------------------+-----------------------+----------------------------+------------------+-----------------------+-------+--------------------+-------+--------------------+-------+---------------+-------+---------------+-----------------+----------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+
only showing top 5 rows
1 Salary Distribution by Employment Type
- Identify salary trends across different employment types.
- Filter the dataset
- Remove records where salary is missing or zero.
- Aggregate Data
- Group by employment type and compute salary distribution.
- Visualize results
- Create a box plot where:
- X-axis =
EMPLOYMENT_TYPE_NAME - Y-axis =
SALARY_FROM
- X-axis =
- Customize colors, fonts, and styles to avoid a 2.5-point deduction.
- Create a box plot where:
- Explanation: Write two sentences about what the graph reveals.
os.makedirs("_output", exist_ok=True)
df = df.replace(
{"Part-time (≤ 32 hours)": "Part-time (less than 32 hours)"},
subset=["EMPLOYMENT_TYPE_NAME"]
)
df_salary = df.filter((df["SALARY_FROM"].isNotNull()) & (df["SALARY_FROM"] > 0))
df_salary_pd = df_salary.select("EMPLOYMENT_TYPE_NAME", "SALARY_FROM").toPandas()
fig = px.box(
df_salary_pd,
x="EMPLOYMENT_TYPE_NAME",
y="SALARY_FROM",
title="Salary Distribution by Employment Type",
labels={"SALARY_FROM": "Salary", "EMPLOYMENT_TYPE_NAME": "Employment Type"},
template="plotly_white",
color="EMPLOYMENT_TYPE_NAME",
color_discrete_sequence=["royalblue", "lightseagreen", "tomato"]
)
fig.update_layout(
font=dict(family="Arial", size=14),
title_font_size=20,
xaxis_title_font_size=16,
yaxis_title_font_size=16
)
fig.show()
fig.write_image("_output/salary_distribution_by_employment_type.svg")1.0.1 Interpretation:
The box plot shows that full-time employment types (working more than 32 hours) offer higher median salaries compared to part-time or mixed employment roles. Part-time positions, especially those working fewer than 32 hours, tend to have lower and more tightly clustered salary distributions, indicating less variation and lower earning potential.
2 Salary Distribution by Industry
- Compare salary variations across industries.
- Filter the dataset
- Keep records where salary is greater than zero.
- Aggregate Data
- Group by NAICS industry codes.
- Visualize results
- Create a box plot where:
- X-axis =
NAICS2_NAME - Y-axis =
SALARY_FROM
- X-axis =
- Customize colors, fonts, and styles.
- Create a box plot where:
- Explanation: Write two sentences about what the graph reveals.
df_salary = df.filter((df["SALARY_FROM"].isNotNull()) & (df["SALARY_FROM"] > 0))
df_salary_pd = df_salary.select("NAICS2_NAME", "SALARY_FROM").toPandas()
fig = px.box(
df_salary_pd,
x="NAICS2_NAME",
y="SALARY_FROM",
color="NAICS2_NAME",
title="Salary Distribution by Industry",
labels={"SALARY_FROM": "Salary", "NAICS2_NAME": "Industry"},
template="plotly_white",
color_discrete_sequence=px.colors.qualitative.Pastel
)
fig.update_layout(
font=dict(family="Arial", size=14),
title_font_size=20,
xaxis_title_font_size=16,
yaxis_title_font_size=16
)
fig.show()
fig.write_image("_output/salary_distribution_by_industry.svg")
2.0.1 Interpretation:
The box plot reveals that industries like Information and Professional, Scientific, and Technical Services have the highest salary medians and the widest salary ranges. In contrast, sectors such as Accommodation and Food Services and Arts, Entertainment, and Recreation show lower and more tightly clustered salary distributions.
3 Job Posting Trends Over Time
- Analyze how job postings fluctuate over time.
- Aggregate Data
- Count job postings per posted date (
POSTED).
- Count job postings per posted date (
- Visualize results
- Create a line chart where:
- X-axis =
POSTED - Y-axis =
Number of Job Postings
- X-axis =
- Apply custom colors and font styles.
- Create a line chart where:
- Explanation: Write two sentences about what the graph reveals.
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")
df = df.withColumn("POSTED_DATE", to_date(col("POSTED"), "MM/dd/yyyy"))
df_postings = df.groupBy("POSTED_DATE").agg(count("*").alias("num_postings"))
df_postings_pd = df_postings.orderBy("POSTED_DATE").toPandas()
fig = px.line(
df_postings_pd,
x="POSTED_DATE",
y="num_postings",
title="Job Posting Trends Over Time",
labels={"POSTED_DATE": "Posted Date", "num_postings": "Number of Job Postings"},
template="plotly_white",
line_shape="linear",
color_discrete_sequence=["royalblue"]
)
fig.update_layout(
font=dict(family="Arial", size=14),
title_font_size=20,
xaxis_title_font_size=16,
yaxis_title_font_size=16,
plot_bgcolor="white"
)
fig.show()
fig.write_image("_output/job_posting_trends_over_time.svg")3.0.1 Interpretation:
The line chart shows that job postings fluctuate heavily over time, with frequent sharp increases and decreases in the number of postings. This pattern suggests a cyclical or weekly trend, where posting activity varies consistently across different days or weeks.
4 Top 10 Job Titles by Count
- Identify the most frequently posted job titles.
- Aggregate Data
- Count the occurrences of each job title (
TITLE_NAME). - Select the top 10 most frequent titles.
- Count the occurrences of each job title (
- Visualize results
- Create a bar chart where:
- X-axis =
TITLE_NAME - Y-axis =
Job Count
- X-axis =
- Apply custom colors and font styles.
- Create a bar chart where:
- Explanation: Write two sentences about what the graph reveals.
df_titles = df.groupBy("TITLE_NAME").agg(count("*").alias("job_count"))
df_top_titles = df_titles.orderBy(col("job_count").desc()).limit(10)
df_top_titles_pd = df_top_titles.toPandas()
df_top_titles_pd["color_group"] = ["Top 1" if i == 0 else "Other" for i in range(len(df_top_titles_pd))]
fig = px.bar(
df_top_titles_pd,
x="TITLE_NAME",
y="job_count",
title="Top 10 Job Titles by Count",
labels={"TITLE_NAME": "Job Title", "job_count": "Job Count"},
color="color_group",
color_discrete_map={
"Top 1": "darkorange",
"Other": "mediumseagreen"
},
template="plotly_white"
)
fig.update_layout(
font=dict(family="Arial", size=14),
title_font_size=20,
xaxis_title_font_size=16,
yaxis_title_font_size=16
)
fig.show()
fig.write_image("_output/top_10_job_titles_by_count.svg")4.0.1 Interpretation:
The bar chart shows that “Data Analysis” is by far the most frequently posted job title, significantly outpacing all other roles. Other common titles such as “Business Intelligence Analyst” and “Enterprise Architects” also appear frequently, reflecting the strong demand for data-driven and technical expertise in the job market.
5 Remote vs On-Site Job Postings
- Compare the proportion of remote and on-site job postings.
- Aggregate Data
- Count job postings by remote type (
REMOTE_TYPE_NAME).
- Count job postings by remote type (
- Visualize results
- Create a pie chart where:
- Labels =
REMOTE_TYPE_NAME - Values =
Job Count
- Labels =
- Apply custom colors and font styles.
- Create a pie chart where:
- Explanation: Write two sentences about what the graph reveals.
df_remote = df.groupBy("REMOTE_TYPE_NAME").agg(count("*").alias("job_count"))
df_remote_pd = df_remote.toPandas()
fig = px.pie(
df_remote_pd,
names="REMOTE_TYPE_NAME",
values="job_count",
title="Remote vs On-Site Job Postings",
template="plotly_white",
color_discrete_map={
"On-Site": "royalblue",
"Remote": "mediumseagreen",
"Hybrid": "gold"
}
)
fig.update_layout(
font=dict(family="Arial", size=14),
title_font_size=20
)
fig.show()
fig.write_image("_output/remote_vs_onsite_job_postings.svg")5.0.1 Interpretation:
The pie chart shows that a large majority of job postings do not specify a remote or on-site status, with remote roles accounting for about 17% of the total. Hybrid remote opportunities are relatively rare, suggesting that while remote work options exist, most postings either remain unspecified or are still tied to traditional work models.
6 Skill Demand Analysis by Industry (Stacked Bar Chart)
- Identify which skills are most in demand in various industries.
- Aggregate Data
- Extract skills from job postings.
- Count occurrences of skills grouped by NAICS industry codes.
- Visualize results
- Create a stacked bar chart where:
- X-axis =
Industry - Y-axis =
Skill Count - Color =
Skill
- X-axis =
- Apply custom colors and font styles.
- Create a stacked bar chart where:
- Explanation: Write two sentences about what the graph reveals.
df_skills = df.filter(col("SKILLS_NAME").isNotNull())
df_skills = df_skills.withColumn("SKILL", explode(split(col("SKILLS_NAME"), r",\s*")))
df_skill_industry = df_skills.groupBy("NAICS2_NAME", "SKILL").agg(count("*").alias("skill_count"))
windowSpec = Window.partitionBy("NAICS2_NAME").orderBy(col("skill_count").desc())
df_skill_industry_ranked = df_skill_industry.withColumn("rank", row_number().over(windowSpec)).filter(col("rank") <= 5)
df_skill_industry_pd = df_skill_industry_ranked.toPandas()
fig = px.bar(
df_skill_industry_pd,
x="NAICS2_NAME",
y="skill_count",
color="SKILL",
title="Top 5 Skills by Industry",
labels={"NAICS2_NAME": "Industry", "skill_count": "Skill Count", "SKILL": "Skill"},
template="plotly_white",
color_discrete_sequence=px.colors.qualitative.Pastel
)
fig.update_layout(
font=dict(family="Arial", size=14),
title_font_size=20,
xaxis_title_font_size=16,
yaxis_title_font_size=16,
barmode='stack'
)
fig.show()
fig.write_image("_output/skill_demand_analysis_by_industry.svg")6.0.1 Interpretation
The stacked bar chart shows that core skills such as Management, Leadership, and Communication are consistently in high demand across a wide range of industries. Technical and specialized skills, like Agile Methodology and Risk Analysis, appear prominently in specific sectors, reflecting targeted skill needs based on industry specialization.
7 Salary Analysis by ONET Occupation Type (Bubble Chart)
- Analyze how salaries differ across ONET occupation types.
- Aggregate Data
- Compute median salary for each occupation in the ONET taxonomy.
- Visualize results
- Create a bubble chart where:
- X-axis =
ONET_NAME - Y-axis =
Median Salary - Size = Number of job postings
- X-axis =
- Apply custom colors and font styles.
- Create a bubble chart where:
- Explanation: Write two sentences about what the graph reveals.
df_clean = df.filter((col("SALARY_FROM").isNotNull()) & (col("SALARY_FROM") > 0))
df_clean.select("ONET_NAME").distinct().show(20, truncate=False)
df_onet = df_clean.groupBy("ONET_NAME").agg(
expr("percentile_approx(SALARY_FROM, 0.5)").alias("median_salary"),
count("*").alias("num_postings")
)
df_onet = df_onet.filter(col("median_salary").isNotNull())
df_onet_pd = df_onet.toPandas()
fig = px.scatter(
df_onet_pd,
x="ONET_NAME",
y="median_salary",
size="num_postings",
title="Salary Analysis by ONET Occupation Type",
labels={"ONET_NAME": "Occupation", "median_salary": "Median Salary", "num_postings": "Number of Job Postings"},
template="plotly_white",
color_discrete_sequence=["royalblue"]
)
fig.update_layout(
font=dict(family="Arial", size=14),
title_font_size=20,
xaxis_title_font_size=16,
yaxis_title_font_size=16,
xaxis_tickangle=-45
)
fig.show()
fig.write_image("_output/salary_analysis_by_onet_occupation.svg")+------------------------------+
|ONET_NAME |
+------------------------------+
|Business Intelligence Analysts|
+------------------------------+
7.0.1 Interpretation:
The bubble chart shows that Business Intelligence Analysts is the only ONET occupation with valid salary information in the dataset. This indicates that most job postings either did not report salary data or had missing occupation details, limiting a broader salary comparison across occupations.
8 Career Pathway Trends (Sankey Diagram)
- Visualize job transitions between different occupation levels.
- Aggregate Data
- Identify career transitions between SOC job classifications.
- Visualize results
- Create a Sankey diagram where:
- Source =
SOC_2021_2_NAME - Target =
SOC_2021_3_NAME - Value = Number of transitions
- Source =
- Apply custom colors and font styles.
- Create a Sankey diagram where:
- Explanation: Write two sentences about what the graph reveals.
df_sankey = df.groupBy("SOC_2021_2_NAME", "SOC_2021_3_NAME").agg(count("*").alias("count"))
df_sankey_pd = df_sankey.toPandas()
all_nodes = list(pd.unique(df_sankey_pd[["SOC_2021_2_NAME", "SOC_2021_3_NAME"]].values.ravel()))
node_indices = {name: i for i, name in enumerate(all_nodes)}
df_sankey_pd["source_id"] = df_sankey_pd["SOC_2021_2_NAME"].map(node_indices)
df_sankey_pd["target_id"] = df_sankey_pd["SOC_2021_3_NAME"].map(node_indices)
fig = go.Figure(data=[go.Sankey(
node=dict(
pad=10,
thickness=10,
line=dict(color="black", width=0.5),
label=all_nodes,
color="royalblue"
),
link=dict(
source=df_sankey_pd["source_id"],
target=df_sankey_pd["target_id"],
value=df_sankey_pd["count"],
color="lightblue"
)
)])
fig.update_layout(
title_text="Career Pathway Trends (SOC Codes)",
font=dict(family="Arial", size=12),
title_font_size=20,
height=400,
width=800
)
fig.show()
fig.write_image("_output/career_pathway_trends.svg")8.0.1 Interpretation:
The Sankey diagram shows that the primary career transition recorded in the dataset occurs between Computer and Mathematical Occupations and Mathematical Science Occupations. This suggests a strong internal mobility trend within mathematical and technical fields, although broader career transitions were limited in the dataset.